# International sunspot number from SIDC, Brussels (1818 - 2024)
SIDC_SILSO <- read_delim("data/raw/SIDC_SILSO.csv", delim = ";", col_names = FALSE) %>%
transmute(Date = as.Date(paste0(X1, "-", X2, "-", X3)),
Sunspots = as.numeric(X5))
## Ambulance Data related to acute cerebrovascular accidents
brain_data <- read_csv("data/raw/brain_data.csv") %>%
mutate(across(c(Year, Season, Month, DayOff, COVID, Sudden_Start, Dst_level), as.factor))
## Ambulance Long Data related to acute cerebrovascular accidents
brain_data_long <- read_csv("data/raw/brain_data_long.csv") %>%
mutate(across(c(Year, Season, Month, DayOff, COVID, Sudden_Start, Dst_level, Sex, Age), as.factor))
Our dataset contains a large number of quantitative variables:
# Data preprocessing
brain_data_clear <- brain_data %>%
select( where(is.numeric) & !ends_with(c("min", "max", "var", "K_Sum")) ) %>%
filter(complete.cases(.))
# Function for displaying correlations with a neutral color for values close to 0
color_correlation_fixed <- function(data, mapping, ...) {
x <- eval_data_col(data, mapping$x)
y <- eval_data_col(data, mapping$y)
cor_value <- cor(x, y, use = "complete.obs")
# Defining color
color <- if (abs(cor_value) < 0.1) {
scales::alpha("gray50", 0.5) ## Gray for weak correlations
} else if (cor_value > 0) {
scales::alpha("blue", abs(cor_value)) ## Blue for positive correlations
} else {
scales::alpha("red", abs(cor_value)) ## Red for negative correlations
}
# Filter for small correlation values
cor_label <- ifelse(abs(cor_value) < 0.01, "0", sprintf("%.2f", cor_value))
ggplot(data.frame()) +
annotate("text", x = 0.5, y = 0.5, label = cor_label, size = 10, color = color ) +
theme_void()
}
# Function for trend line with a golden color
golden_smooth <- function(data, mapping, ...) {
ggplot(data, mapping) +
geom_point(alpha = 0.4, size = 0.5, color = "black") +
geom_smooth(color = "gold", ...) +
theme_custom
}
# Plot construction
ggpairs(
brain_data_clear,
progress = FALSE,
upper = list(continuous = color_correlation_fixed), ## Highlighting correlations
lower = list(continuous = golden_smooth) )+ ## Golden trend line
theme_custom +
theme(
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10) )
When analyzing the network graph, three groups of features can be identified:
cor_data <- brain_data %>%
select( where(is.numeric) & !ends_with(c("min", "max", "Sum", "var")) ) %>%
rename(Temperature = Temp_mean, Precipitation = H2O,
`F10.7 index` = F10.7, `Ap index` = Ap_mean, `Dst index` = Dst_mean) %>%
cor(use = "pairwise.complete.obs")
corrplot(cor_data, method = 'number', type = 'lower', diag = FALSE)
network_plot(cor_data, min_cor = .1)
Some features within the groups are highly correlated!
brain_data %>%
drop_na() %>%
ggplot(aes(Temp_mean, Pressure))+
geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
stat_cor(method = "pearson", label.x.npc = 0.6, label.y.npc = 0.9, size = 7)+
labs(x = "Temperature")+
theme_custom
brain_data %>%
ggplot(aes(Sunspots, F10.7))+
geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
stat_cor(method = "pearson", label.x.npc = 0, label.y.npc = 0.9, size = 7)+
labs(x = "Sunspot Number", y = "F10.7 index")+
theme_custom
brain_data %>%
ggplot(aes(Dst_mean, Ap_mean))+
geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
stat_cor(method = "pearson", label.x.npc = 0.6, label.y.npc = 0.9, size = 7)+
labs(x = "Daily Average Dst index", y = "Daily Average Ap index")+
theme_custom
Based on further analysis, the following variables will be excluded:
However, all the variables considered do not have a noticeable relationship with the number of ambulance calls:
brain_data %>%
rename(Temperature = Temp_mean, `Ap index` = Ap_mean, `Dst index` = Dst_mean) %>%
pivot_longer(cols = c(Temperature, Sunspots, `Ap index`, `Dst index`)) %>%
ggplot(aes(x = value, y = EMS))+
geom_point()+
geom_smooth(se = FALSE, method = "gam", colour = "goldenrod2", linewidth = 2)+
theme_custom +
labs(x = "", y = "Daily Emergency Calls") +
facet_wrap(~name, scales = "free")
The number of sunspots is directly linked to the solar activity cycles, as confirmed by the graph:
brain_data %>%
select(!c(Pressure, F10.7, Ap_mean)) %>%
ggplot(aes(Date, Sunspots))+
geom_line(alpha = 0.3, colour = "royalblue2")+
geom_smooth(method = "gam", formula = y ~ s(x, k = 10), se = FALSE,
colour = "orangered4", linewidth = 2) +
geom_vline(xintercept = as.Date("2009-01-01"),
colour = "orangered", linewidth = 1, linetype = "dashed")+
geom_vline(xintercept = as.Date("2019-12-31"),
colour = "orangered", linewidth = 1, linetype = "dashed")+
scale_x_date(date_breaks = "2 year", date_labels = "%Y")+
labs(x = "", y = "Sunspot number")+
theme_custom
Our data covers the complete 24th solar cycle, spanning the period from December 2008 to December 2019.
Working with time series allows us to decompose the data to identify trends and cyclic patterns:
ggarrange(
brain_data %>% filter(Date >= ymd("2007-01-01")) %>% pull(Sunspots) %>%
ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>%
autoplot() + theme_custom + ggtitle("Sunspots"),
brain_data %>% filter(Date >= ymd("2007-01-01")) %>% pull(EMS) %>%
ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>%
autoplot() + theme_custom + ggtitle("EMS"),
brain_data %>% filter(Date >= ymd("2007-01-01")) %>% pull(Temp_mean) %>%
ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>%
autoplot() + theme_custom + ggtitle("Temperature"),
nrow = 1
)
ggarrange(
brain_data %>% filter(Date >= ymd("2007-01-01")) %>% pull(Sunspots) %>%
ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>%
autoplot() + theme_custom + ggtitle("Sunspots"),
brain_data %>% filter(Date >= ymd("2007-01-01")) %>% pull(Ap_mean) %>%
ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>%
autoplot() + theme_custom + ggtitle("Ap Index"),
brain_data %>% filter(Date >= ymd("2007-01-01")) %>% pull(Dst_var) %>%
ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>%
autoplot() + theme_custom + ggtitle("Dst amplitude"),
nrow = 1
)
Solar activity shows cycles, while EMS follows a linear trend, as seen in the decomposition. However, there’s no significant difference when looking at the data by month over this period:
ggarrange(
brain_data %>%
filter(Date %within% interval(ymd("2007-01-01"), ymd("2019-12-31"))) %>%
group_by(Year, Month) %>%
summarise(EMS_month = sum(EMS, na.rm = TRUE), .groups = "drop") %>%
ggplot(aes(x = Month, y = Year, fill = EMS_month)) +
geom_tile(colour = "black") +
scale_fill_gradient(low = "green", high = "red") +
scale_x_discrete(labels = month.abb) +
labs(x = "", y = "", fill = "", title = "Total Number of Emergency Calls")+
theme_custom,
brain_data %>%
filter(Date %within% interval(ymd("2007-01-01"), ymd("2019-12-31"))) %>%
group_by(Year, Month) %>%
summarise(Sunspots_month = sum(Sunspots, na.rm = TRUE), .groups = "drop") %>%
ggplot(aes(x = Month, y = Year, fill = Sunspots_month)) +
geom_tile(colour = "black") +
scale_fill_gradient(low = "green", high = "red") +
scale_x_discrete(labels = month.abb) +
labs(x = "", y = "", fill = "", title = "Total Number of Sunspots")+
theme_custom
)
When examining solar cycles over the past 200 years, sunspots can be represented as a factor variable - Sunspots level:
Sunspots_quantile <- SIDC_SILSO %>%
filter(Sunspots != -1, between(Date, as.Date("1823-01-01"), as.Date("2020-01-01"))) %>%
pull(Sunspots) %>% quantile()
SIDC_SILSO %>%
filter(Sunspots != -1,
between(Date, as.Date("1823-01-01"), as.Date("2020-01-01"))) %>%
mutate(Sunspots_level = cut(Sunspots,
breaks = c(-Inf, quantile(Sunspots)[2], quantile(Sunspots)[3],
quantile(Sunspots)[4], Inf), labels =
c("Low", "Medium", "High", "Extreme"))) %>%
ggplot(aes(Date, Sunspots, group = 1))+
geom_line(aes(color = Sunspots_level), linewidth = 2)+
geom_hline(yintercept = Sunspots_quantile[2], colour = "black", linewidth = 1, linetype = "dashed")+
geom_hline(yintercept = Sunspots_quantile[3], colour = "black", linewidth = 1, linetype = "dashed")+
geom_hline(yintercept = Sunspots_quantile[4], colour = "black", linewidth = 1, linetype = "dashed")+
scale_colour_manual(values = c("Low" = "darkorange", "Medium" = "darkorange2",
"High" = "darkorange3", "Extreme" = "darkorange4"),
labels = c("Low" = "Low (0-22)", "Medium" = "Medium (23-65)",
"High" = "High (66-129)","Extreme" = "Extreme (>129)")) +
scale_x_date( date_breaks = "20 year", date_labels = "%Y")+
scale_y_continuous(n.breaks = 10)+
labs(x = "", y = "Sunspot number", color = "Sunspots level", title = "Solar Cycles (1823 - 2023)")+
theme_custom+
annotate("text", x = as.Date("1815-01-01"),
y = c( (Sunspots_quantile[1] + Sunspots_quantile[2])/2, (Sunspots_quantile[2] + Sunspots_quantile[3])/2,
(Sunspots_quantile[3] + Sunspots_quantile[4])/2, (Sunspots_quantile[4] + Sunspots_quantile[5])/3),
label = c("Q1", "Q2", "Q3", "Q4"), size = 10)
No difference in the number of emergency calls on days with different Sunspots_level is observed:
brain_data %>%
mutate(Sunspots_level = case_when(Sunspots > Sunspots_quantile[4] ~ "Extreme",
Sunspots > Sunspots_quantile[3] ~ "High",
Sunspots > Sunspots_quantile[2] ~ "Medium",
TRUE ~ "Low") %>% factor(fct_recode(c("Low", "Medium", "High", "Extreme")))) %>%
ggplot(aes(x = EMS, y = Sunspots_level, fill = Sunspots_level )) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(
values = c("Low" = "darkorange", "Medium" = "darkorange2", "High" = "darkorange3","Extreme" = "darkorange4"),
labels = c("Low" = "Low (0-22)", "Medium" = "Medium (23-65)",
"High" = "High (66-129)","Extreme" = "Extreme (>129)")) +
labs(x = "Daily Ambulance Calls", y = "", fill = "Sunspots level",
title = "Relationship Between Sunspot Levels and Ambulance Calls")+
theme_custom
brain_data %>%
mutate(Sunspots_level = case_when(Sunspots > Sunspots_quantile[4] ~ "Extreme",
Sunspots > Sunspots_quantile[3] ~ "High",
Sunspots > Sunspots_quantile[2] ~ "Medium",
TRUE ~ "Low") %>% factor(fct_recode(c("Low", "Medium", "High", "Extreme")))) %>%
pairwise_t_test(EMS ~ Sunspots_level, p.adjust.method = "holm") %>%
transmute(`Group 1` = group1, `Group 2` = group2,
`p-value` = p, `p.adjusted` = p.adj) %>%
flextable()
Group 1 | Group 2 | p-value | p.adjusted |
|---|---|---|---|
Low | Medium | 0.0549 | 0.330 |
Low | High | 0.0675 | 0.338 |
Medium | High | 0.9470 | 1.000 |
Low | Extreme | 0.1730 | 0.691 |
Medium | Extreme | 0.7710 | 1.000 |
High | Extreme | 0.8090 | 1.000 |
At the same time, a noticeable difference in the Dst index is observed as the sunspot number changes:
brain_data %>%
filter(Dst_level != "Severe") %>%
mutate(Dst_level = fct_relevel(Dst_level, c("Quiet", "Weak", "Moderate", "Major", "Severe"))) %>%
ggplot(aes(Dst_level, Sunspots))+
geom_jitter(alpha = 0.5, size = 1, width = 0.2) +
geom_boxplot(alpha = 0.5, outliers = FALSE, fill = "goldenrod2", linewidth = 1)+
labs( x = "Geomagnetic activity level", y = "Sunspot Number" ) +
theme_custom
mean_values <- brain_data %>%
mutate(DayOff = fct_recode(DayOff, "Working days" = "No", "Non-working days" = "Yes")) %>%
group_by(DayOff) %>%
summarise(mean_EMS = round(mean(EMS, na.rm = TRUE), 2), .groups = "drop")
brain_data %>%
mutate(DayOff = fct_recode(DayOff, "Working days" = "No", "Non-working days" = "Yes")) %>%
ggplot(aes(x = DayOff, y = EMS)) +
geom_boxplot(aes(fill = DayOff)) +
geom_text(data = mean_values,
aes(x = DayOff, y = mean_EMS, label = mean_EMS),
color = "black", size = 8, fontface = "bold", vjust = -1) +
scale_fill_brewer(palette = "Dark2", direction = -1)+
scale_y_continuous(limits = c(NA, max(brain_data$EMS) + 3))+
labs(y = "Daily Emergency Calls", x = "")+
theme_custom +
theme(legend.position = "none")+
stat_pvalue_manual(t_test(data = brain_data %>%
mutate(DayOff = fct_recode(DayOff,
"Working days" = "No",
"Non-working days" = "Yes")),
EMS ~ DayOff),
label = "T-test, p = {p}",
size = 10,
y.position = max(brain_data$EMS) + 0.1)
The distribution of EMS calls across seasons is not statistically significant (paired t-test with Holm adjustment) p.adjusted > 0.05).
brain_data %>%
mutate(Season = fct_relevel(Season, c("Winter", "Spring", "Summer", "Autumn"))) %>%
ggplot(aes(x = Season, y = EMS, fill = Season)) +
geom_boxplot(alpha = 0.7, show.legend = FALSE) +
scale_fill_manual(
values = c("Winter" = "dodgerblue2", "Spring" = "palegreen2", "Summer" = "tan2","Autumn" = "tomato2")) +
labs(y = "Daily Emergency Calls", x = "")+
theme_custom
brain_data %>%
pairwise_t_test(EMS ~ Season, p.adjust.method = "holm") %>%
transmute(`Group 1` = group1, `Group 2` = group2,
`p-value` = p, `p.adjusted` = p.adj) %>%
flextable()
Group 1 | Group 2 | p-value | p.adjusted |
|---|---|---|---|
Autumn | Spring | 0.0162 | 0.0969 |
Autumn | Summer | 0.1300 | 0.5200 |
Spring | Summer | 0.3670 | 1.0000 |
Autumn | Winter | 0.0259 | 0.1290 |
Spring | Winter | 0.8650 | 1.0000 |
Summer | Winter | 0.4670 | 1.0000 |
ggarrange(
brain_data_long %>%
filter(between(as.numeric(as.character(Year)), 2007, 2021), Age != "55+") %>%
group_by(Year, Sex, Age) %>%
summarise(EMS_total = sum(EMS, na.rm = TRUE), .groups = "drop") %>%
mutate(EMS_plot = ifelse(Sex == "Male", -EMS_total, EMS_total)) %>%
ggplot(aes(x = Year, y = EMS_plot, fill = Sex)) +
geom_bar(stat = "identity", alpha = 0.7, colour = "black", show.legend = FALSE) +
scale_y_continuous(labels = abs, n.breaks = 10) +
scale_fill_manual(
values = c("Male" = "steelblue", "Female" = "pink"),
labels = c("Male" = "Male", "Female" = "Female") ) +
coord_flip() +
labs(
title = "Ambulance Calls Distribution by Year, Age and Gender",
x = "", y = "", fill = "Gender") +
theme_custom +
facet_wrap(. ~ Age, scales = "free_x",
labeller = labeller(Age = c("25-44" = "Age: 25-44 years",
"45-54" = "Age: 45-54 years"))),
brain_data_long %>%
filter(between(as.numeric(as.character(Year)), 2007, 2021), Age == "55+") %>%
group_by(Year, Sex, Age) %>%
summarise(EMS_total = sum(EMS, na.rm = TRUE), .groups = "drop") %>%
mutate(EMS_plot = ifelse(Sex == "Male", -EMS_total, EMS_total)) %>%
ggplot(aes(x = Year, y = EMS_plot, fill = Sex)) +
geom_bar(stat = "identity", alpha = 0.7, colour = "black", ) +
scale_y_continuous(labels = abs, n.breaks = 10) +
scale_fill_manual(
values = c("Male" = "steelblue", "Female" = "pink"),
labels = c("Male" = "Male", "Female" = "Female") ) +
coord_flip() +
labs(
x = "", y = "Total EMS", fill = "Gender") +
theme_custom + theme(legend.position = "inside", legend.justification = c(0.9, 0.2))+
facet_wrap(. ~ Age, scales = "free_x",
labeller = labeller(Age = c("55+" = "Age: 55+ years"))),
nrow = 2
)
Daily Average Kp index and Daily Average K index (IRT) show a strong correlation (r = 0.91). However, K index (IRT) has a smaller range and a more uniform distribution compared to Kp index.
brain_data %>%
drop_na(Kp_Sum, K_Sum) %>%
ggplot(aes(Kp_Sum, K_Sum))+
geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
stat_cor(method = "pearson", label.x.npc = 0, label.y.npc = 0.9, size = 7)+
labs(x = "Daily Average Kp index", y = "Daily Average K index (IRT)")+
theme_custom
brain_data %>%
drop_na(Kp_Sum, K_Sum) %>%
pivot_longer(cols = c(Kp_Sum, K_Sum), names_to = "index") %>%
ggplot() +
geom_histogram(aes(x = value, fill = index),
binwidth = 5, colour = "black", alpha = 0.5, position = "identity", show.legend = FALSE) +
scale_fill_manual(
values = c("K_Sum" = "darkseagreen2", "Kp_Sum" = "steelblue2") )+
scale_x_continuous(breaks = seq(0, 50, 10))+
labs( x = "Index Values", y = "Observation Count", fill = "Index Type") +
theme_custom +
facet_wrap(~index, labeller = labeller(index = c("K_Sum" = "Daily Average K index (IRT)",
"Kp_Sum" = "Daily Average Kp index")))